# `suppressPackageStartupMessages` is a function in R that suppresses the startup messages that are generated when loading a package. It can be useful to prevent the console from being cluttered with messages when you are loading multiple packages or when you don't want to see the package loading messages.
shhh <- suppressPackageStartupMessages # It's a library, so shhh!


shhh(library( mgcv ))
shhh(library(dplyr))
shhh(library(ggplot2))
shhh(library(lme4))
shhh(library(tidymv))
shhh(library(gamlss))
shhh(library(gsubfn))
shhh(library(lmerTest))
shhh(library(tidyverse))
shhh(library(boot))
shhh(library(rsample))
shhh(library(plotrix))
shhh(library(ggrepel))
shhh(library(mgcv))

# `theme_set(theme_bw())` is a command in R that sets the default theme for all subsequent plots to a theme called `theme_bw()`.
# `theme_bw()` is a built-in theme in the ggplot2 package: a dark-on-light theme with a white panel background, light grey grid lines and a dark panel border. By setting the default theme to `theme_bw()`, all plots that are created using ggplot2 will have this theme unless a different theme is explicitly specified.
theme_set(theme_bw())

# `options(digits=4)` is a command in R that sets the number of digits to display when printing numeric values.

# By default, R will print up to 7 digits for numeric values. However, you can use the `options()` function to change this behavior. In this case, `options(digits=4)` sets the number of digits to 4, which means that all numeric values will be displayed with at most 4 digits.
options(digits=4)
set.seed(444)

# Progress marker for pipelines: emits `status` through message() and then
# returns `.data` untouched, so the call can be dropped between two pipe steps
# without affecting the data flowing through.
pipe_message <- function(.data, status) {
  message(status)
  .data
}

Read in MoTR Data

# Interesting way of reading multiple files in one directory!

file_prefix = "../reading_measures/cleaned_f160"
fnames = list.files(path=file_prefix)

# fnames: [1] "reader_55_reading_measures.csv" "reader_56_reading_measures.csv"
# [3] "reader_59_reading_measures.csv" "reader_61_reading_measures.csv"

# Read in the data
df = data.frame()

# For each file, mutate() adds a `subj` column holding the reader id, obtained by stripping the suffix "_reading_measures.csv" from the file name `f` (the loop variable, not a column). Only the columns needed later are kept, and `go_pass_time` is renamed to `go_past_time`.
for (f in fnames) {
  temp = read.csv(paste0(file_prefix, "/", f)) %>%
    mutate(subj = str_remove(f, "_reading_measures.csv")) %>%
    dplyr::select(expr_id, cond_id, para_nr, word, word_nr, first_duration, total_duration,
                  gaze_duration, go_pass_time, FPReg, subj) %>%
    rename(go_past_time = go_pass_time)
  df = rbind(df, temp)
}
df

# `gather()` stacks columns 6-10 (first_duration, total_duration, gaze_duration, go_past_time, FPReg): the column names go into a new `metric` column and their values into a new `value` column.
motr_df = df %>%
  mutate(expr_id = if_else(expr_id == 1, "Attachment", "Provo")) %>%
  gather(metric, value, 6:10)

motr_df
NA
# Average across subjects
motr_agg_df = motr_df %>%
  drop_na() %>%
  group_by(expr_id, cond_id, para_nr, word, word_nr, metric) %>%
    summarise(value = mean(value)) %>%    # sum up four subjects for each metric and divide by 4.
  ungroup() %>%
  arrange(expr_id, cond_id, para_nr, word_nr)   # like sort in python
`summarise()` has grouped output by 'expr_id', 'cond_id', 'para_nr', 'word', 'word_nr'. You can override using the `.groups` argument.
# View(motr_agg_df)

motr_agg_df

table(motr_df$subj)

reader_64 reader_66 reader_67 reader_68 reader_70 
    11990     11990     11990     11990     11990 
# reader_55 reader_56 reader_59 reader_61 
#    3100      3100      3100      3100

motr_provo_df = motr_agg_df %>%
  filter(expr_id == "Provo") %>%
  rename(text_id = para_nr,
         word_text_idx = word_nr,
         motr_value = value) %>%
  dplyr::select(-expr_id, -cond_id)

motr_provo_df
NA

Comparison to Provo

# Read in Provo surprisal, frequency and length data
provo_modeling_df = read.csv("../ancillary_data/provo_df.csv") %>%
  dplyr::select(text_id, sent_id, trigger_idx, word, freq, surp, len) %>%
  rename(word_idx = trigger_idx)

provo_modeling_df
NA
# Read in Provo eyetracking data

provo_raw_df = read.csv("../ancillary_data/provo_eyetracking.csv")

provo_eyetracking_df = provo_raw_df %>%
  dplyr::select(Participant_ID, Text_ID, Sentence_Number, Word_In_Sentence_Number, IA_ID, Word,IA_FIRST_FIXATION_DURATION, IA_FIRST_FIX_PROGRESSIVE, IA_FIRST_RUN_DWELL_TIME, IA_DWELL_TIME, IA_REGRESSION_PATH_DURATION, IA_REGRESSION_OUT) %>%
  rename( first_duration = IA_FIRST_FIXATION_DURATION,    # whether it is first pass?
          gaze_duration = IA_FIRST_RUN_DWELL_TIME,
          total_duration = IA_DWELL_TIME,
          go_past_time = IA_REGRESSION_PATH_DURATION,
          FPReg = IA_REGRESSION_OUT,
          subj = Participant_ID,
          text_id = Text_ID,
          sent_id = Sentence_Number,
          word_idx = Word_In_Sentence_Number,
          word_text_idx = IA_ID,
          word = Word,
          ff_progressive = IA_FIRST_FIX_PROGRESSIVE) %>% # notice:average across subj, binary(0,1) becomes float.
  mutate(gaze_duration = ifelse(ff_progressive == 0, 0, gaze_duration),
         go_past_time = ifelse(ff_progressive == 0, 0, go_past_time)) %>%
  dplyr::select(-ff_progressive) %>%
  gather(metric, value, 7:11) %>%      # not include FFReg which is in column 11
  mutate(value = if_else(is.na(value), as.integer(0), as.integer(value))) %>%
  drop_na() %>%   # actually, drop first word in a sentence
  group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
  summarise(value = mean(value)) %>%
  ungroup()
`summarise()` has grouped output by 'text_id', 'word_text_idx', 'sent_id', 'word_idx', 'word'. You can override using the `.groups` argument.
# View(provo_eyetracking_df)
provo_eyetracking_df
NA
provo_df = merge(provo_eyetracking_df, provo_modeling_df, by=c("text_id", "sent_id", "word_idx")) %>%
  mutate(word_text_idx = as.integer(word_text_idx - 1)) %>%
  arrange(text_id, sent_id, word_idx)
provo_df

provo_df = merge(provo_df, motr_provo_df, by=c("text_id", "word_text_idx", "metric")) %>%
  rename(eyetr_value = value) %>%
  arrange(text_id, sent_id, word_idx) %>%
  filter(word.x == word) %>%      #word.y has no captical word
  dplyr::select(-word.x, -word.y) %>%
  mutate(motr_outlier = if_else(motr_value > (mean(motr_value) + 3 * sd(motr_value) ), T, F)) %>%
  filter(motr_outlier == F) %>%     # clear outlier -> 13 was filtered.
  gather(measure, value, c("eyetr_value", "motr_value"))  %>%
  # filter(metric != "first_duration") %>%
  filter(metric != "FPReg")

# View(provo_df)
provo_df %>%
  ggplot(aes(x = value, color=metric)) +
    geom_density() +
    facet_wrap(.~measure) +
    xlab("Reading Measure")
Error in `combine_vars()`:
! Faceting variables must have at least one value
Backtrace:
 1. base (local) `<fn>`(x)
 2. ggplot2:::print.ggplot(x)
 4. ggplot2:::ggplot_build.ggplot(x)
 5. layout$setup(data, plot$data, plot$plot_env)
 6. ggplot2 (local) setup(..., self = self)
 7. self$facet$compute_layout(data, self$facet_params)
 8. ggplot2 (local) compute_layout(..., self = self)
 9. ggplot2::combine_vars(data, params$plot_env, vars, drop = params$drop)

provo_df %>%
  filter(measure == "motr_value") %>%
  ggplot(aes(x = value, color=metric)) +
    geom_density() +
    xlab("Reading Measure") +
    ggtitle("Density plot of value for motr")

provo_df %>%
  filter(measure == "eyetr_value") %>%
  ggplot(aes(x = value, color=metric)) +
    geom_density() +
    xlab("Reading Measure") +
    ggtitle("Density plot of value for eyetr")

gd_df = provo_df %>% filter(metric == "total_duration") %>% spread(measure, value)
# View(gd_df)

cor.test(gd_df$eyetr_value, gd_df$motr_value)          #cor rise from 0.529 to 0.5989 

    Pearson's product-moment correlation

data:  gd_df$eyetr_value and gd_df$motr_value
t = 18, df = 377, p-value <2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6113 0.7226
sample estimates:
   cor 
0.6707 
provo_df %>%
  spread(measure, value) %>%
  ggplot(aes(x = motr_value, y=eyetr_value, color=metric)) +
    geom_point(alpha = 0.2) +
    facet_wrap(.~metric, scales="free") +
    geom_smooth()


# ggsave("../visualization/metric_cor.png", device = "png", width = 6, height = 2.5)
gd_df_2 = provo_df %>% filter(metric == "first_duration") %>% spread(measure, value)

cor.test(gd_df_2$eyetr_value, gd_df_2$motr_value)

    Pearson's product-moment correlation

data:  gd_df_2$eyetr_value and gd_df_2$motr_value
t = 18, df = 379, p-value <2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6220 0.7305
sample estimates:
 cor 
0.68 
gd_df_3 = provo_df %>% filter(metric == "gaze_duration") %>% spread(measure, value)

cor.test(gd_df_3$eyetr_value, gd_df_3$motr_value)

    Pearson's product-moment correlation

data:  gd_df_3$eyetr_value and gd_df_3$motr_value
t = 19, df = 379, p-value <2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6406 0.7447
sample estimates:
   cor 
0.6963 
gd_df_4 = provo_df %>% filter(metric == "go_past_time") %>% spread(measure, value)

cor.test(gd_df_4$eyetr_value, gd_df_4$motr_value)

    Pearson's product-moment correlation

data:  gd_df_4$eyetr_value and gd_df_4$motr_value
t = 14, df = 368, p-value <2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.5197 0.6530
sample estimates:
   cor 
0.5903 
provo_df %>%
  gather(word_prop, word_prop_val, c("freq", "len", "surp")) %>%
  filter(metric == "gaze_duration") %>%
  ggplot(aes(x = value, y=word_prop_val, color = measure)) +
    geom_point(alpha = 0.2) +
    facet_grid(word_prop~measure, scales="free") +
    geom_smooth() +
    xlab("Reading Measure")


# ggsave("../visualization/word_prop_comps.png", device = "png", width = 6, height = 3)
provo_df %>%
  ggplot(aes(x = value, y=freq, color=metric)) +
    geom_point(alpha = 0.2) +
    facet_grid(metric~measure, scales="free") +
    geom_smooth()

provo_df %>%
  ggplot(aes(x = value, y=surp, color=metric)) +
    geom_point(alpha = 0.2) +
    facet_grid(metric~measure, scales="free") +
    geom_smooth()

Shape of surprisal / RT relationship


# Fit one GAM to a single bootstrap resample and return the predicted partial
# effect of surprisal on the response.
#
# Args:
#   bootstrap_sample: one rsample split (an element of `bootstraps()$splits`).
#   mean_predictors:  one-row data frame with columns `surp`, `freq` and `len`;
#                     used to fill the predictors that are held constant in
#                     the prediction grid.
# Returns:
#   The prediction grid (surp = 0, 0.1, ..., 20 plus the constant predictors)
#   with an extra column `y`: the summed contributions of the `s(surp)` and
#   `s(prev_surp)` terms.
fit_gam_inner <- function(bootstrap_sample, mean_predictors) {
  df <- bootstrap_sample$data
  # Count how often each original row index occurs in the resample and use
  # the counts as case weights, so the model is fit on the full data frame.
  weights <- tabulate(as.integer(bootstrap_sample), nrow(df))

  m <- gam(
    psychometric ~ s(surp, bs = "cr", k = 6) + s(prev_surp, bs = "cr", k = 6) +
      te(freq, len, bs = "cr") + te(prev_freq, prev_len, bs = "cr"),
    data = df, weights = weights
  )
  terms_to_predict <- c("s(surp)", "s(prev_surp)")

  # `len` / `prev_len` are held at the mean word length. (They were previously
  # filled with the mean *frequency* by mistake.)
  newdata <- data.frame(
    surp = seq(0, 20, by = 0.1), prev_surp = mean_predictors$surp,
    freq = mean_predictors$freq, prev_freq = mean_predictors$freq,
    len = mean_predictors$len, prev_len = mean_predictors$len
  )

  # Matrix with one row per grid point and one column per requested term.
  per_term_predictions <- predict(m, newdata = newdata, terms = terms_to_predict, type = "terms")

  # Additive model -- sum the per-term contributions across columns.
  predictions <- rowSums(per_term_predictions)

  newdata %>% mutate(y = predictions)
}

# Bootstrap the GAM fit and summarise the predicted surprisal effect.
#
# Args:
#   df:              data frame containing the columns used by the model
#                    formula in fit_gam_inner().
#   mean_predictors: passed through unchanged to fit_gam_inner().
#   alpha:           the band spans the alpha/2 and 1 - alpha/2 quantiles of
#                    the bootstrap predictions (2.5% / 97.5% by default).
#   times:           number of bootstrap resamples. Defaults to 10, the value
#                    that used to be hard-coded.
# Returns:
#   One row per surprisal grid value with columns `surp`, `y_lower`,
#   `y_upper` and `y` (the mean over bootstrap fits).
fit_gam <- function(df, mean_predictors, alpha = 0.05, times = 10) {
  # Resample the data and fit one GAM per resample.
  boot_models <- df %>%
    bootstraps(times = times) %>%
    mutate(smoothed = map(splits, fit_gam_inner, mean_predictors = mean_predictors))

  # Per surprisal value: lower/upper quantile and mean of the predictions.
  # `y` is summarised last so the quantiles still see the raw predictions.
  boot_models %>%
    unnest(smoothed) %>%
    dplyr::select(surp, y) %>%
    group_by(surp) %>%
    summarise(
      y_lower = quantile(y, alpha / 2),
      y_upper = quantile(y, 1 - alpha / 2),
      y = mean(y)
    ) %>%
    ungroup()
}

gam_modeling_df = provo_df %>%
  spread(measure, value) %>%
  mutate(len = nchar(word)) %>%
  group_by(metric, text_id) %>%
    arrange(word_text_idx) %>%
    mutate(prev_surp = lag(surp),
           prev_freq = lag(freq),
           prev_len = lag(len),
           prev_eyetr_value = lag(eyetr_value)) %>%
  ungroup() %>%
  drop_na() %>%
  rename(psychometric = motr_value)


smooths_df = data.frame()

metrics = c("gaze_duration", "total_duration", "go_past_time", "first_duration")
for (m in metrics) {
  print(paste0("Fitting model for ", m))
  dummy_df = gam_modeling_df %>% filter(metric == m)
  mean_predictors = dummy_df %>% summarise(surp = mean(surp), len = mean(len), freq = mean(freq))
  smooths = dummy_df %>% fit_gam(., mean_predictors)
  #Fix 0 surprisal = 0 ms
  gam_smooths = smooths %>% mutate(delta = 0 - y[1], y=y + delta, y_lower= y_lower + delta, y_upper=y_upper + delta)
  smooths_df = rbind(smooths_df, gam_smooths %>% mutate(psychometric = m))
}
[1] "Fitting model for gaze_duration"
[1] "Fitting model for total_duration"
[1] "Fitting model for go_past_time"
[1] "Fitting model for first_duration"

Targeted Evaluation Data

# View(motr_df)

motr_attach_df = motr_df %>%
  filter(expr_id == "Attachment") %>%
  rename( item_id = para_nr) %>%
  mutate(item_id = as.integer(item_id)) %>%
  mutate(cond_id = as.factor(cond_id)) %>%
  mutate(cond_id = if_else(cond_id == 1, "No Comma", 
                          if_else(cond_id == 2, "Comma", 
                                  if_else(cond_id == 3, "adv_high",
                                          if_else(cond_id == 4, "adv_low",
                                                  if_else(cond_id == 5, "relative_high",
                                                          if_else(cond_id == 6, "relative_low",
                                                                  if_else(cond_id == 7, "practice", "filler")))))))) %>%
  filter(metric != "FPReg") %>%
  filter(cond_id == "relative_high" | cond_id == "relative_low") %>%

  # filter(! (item_id == 4 & cond_id == "No Comma") ) %>% # just because of alignment issues for now
  
  mutate(crit = if_else(word == "who", word_nr, as.integer(0) )) %>%
  group_by(cond_id, item_id) %>%
    mutate(crit = unique(crit)[2]) %>%
  ungroup() %>%
  mutate(word_nr = word_nr - crit)
View(motr_attach_df)


# Mean reading time per condition, relative word position and metric, for the
# window from two words before the critical word up to five words after it.
agg_motr_attach_df <- motr_attach_df %>%
  drop_na() %>%
  filter(word_nr >= -2, word_nr < 6) %>%
  group_by(cond_id, word_nr, metric) %>%
  # NOTE: the column is called `sd` but holds the standard error of the mean.
  # The interval is m +/- 1.96 * SE; the lower bound previously used 1.98,
  # which made the error bars asymmetric.
  summarise(
    m = mean(value),
    sd = std.error(value),
    upper = m + 1.96 * sd,
    lower = m - 1.96 * sd,
    n = n()
  ) %>%
  ungroup()
`summarise()` has grouped output by 'cond_id', 'word_nr'. You can override using the `.groups` argument.
View(agg_motr_attach_df)

agg_motr_attach_df %>%
  ggplot(aes(x = word_nr, y = m, color = cond_id)) +
    geom_rect(aes(xmin = 2.5, xmax = 5.5, ymin = 100, ymax = 800), fill=alpha("white", 0), color = "#45ef70", linetype = "dotted") +
    geom_point() +
    geom_errorbar(aes(ymax = upper, ymin = lower), width = 0.3) +
    geom_line() +
    #geom_text(aes(label = word, y = if_else(cond_id == "Comma", 3000, 3500)), size = 2) +
    #facet_grid(para_nr~cond_id) +
  ylab("Reading Time") +
  xlab("Condition") +
  # scale_x_continuous(breaks=-2:5, labels=c("the", "man", "and", "his", "wife", "ran", "away", "from")) +
  scale_x_continuous(breaks=-2:5, labels=c('the', 'queen', 'who', 'praised', 'herself', 'all', 'the', 'time')) +
  facet_grid(~metric) +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )


#ggsave("../visualization/attachment.png", device = "png", width = 6, height = 3)

options(JULIA_HOME = "/Applications/Julia-1.8.app/Contents/Resources/julia/bin/")
# library(jglmm)
# jglmm_setup()

attach_lm_df = motr_attach_df %>%
  filter(metric == "gaze_duration") %>%
  filter(word_nr == 3) %>%
  mutate(item_id = as.factor(item_id),
         subj = as.factor(subj))

m = attach_lm_df %>%
  lmer(value ~ cond_id + (cond_id | item_id) + (cond_id | subj), data=.)
boundary (singular) fit: see help('isSingular')
summary(m)
Linear mixed model fit by REML. t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: value ~ cond_id + (cond_id | item_id) + (cond_id | subj)
   Data: .

REML criterion at convergence: 1112

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-1.797 -0.639 -0.280  0.523  3.497 

Random effects:
 Groups   Name                Variance Std.Dev. Corr 
 item_id  (Intercept)         25837    160.7         
          cond_idrelative_low  2360     48.6    -1.00
 subj     (Intercept)           948     30.8         
          cond_idrelative_low 14451    120.2    1.00 
 Residual                     63112    251.2         
Number of obs: 80, groups:  item_id, 24; subj, 5

Fixed effects:
                    Estimate Std. Error    df t value Pr(>|t|)    
(Intercept)            252.1       53.8  15.5    4.68  0.00027 ***
cond_idrelative_low    -18.4       79.1   5.1   -0.23  0.82560    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr)
cnd_drltv_l -0.290
optimizer (nloptwrap) convergence code: 0 (OK)
boundary (singular) fit: see help('isSingular')
---
title: "Exploratory Analysis for MoTR Reading Data"
output: html_notebook
---

```{r}
# `suppressPackageStartupMessages` is a function in R that suppresses the startup messages that are generated when loading a package. It can be useful to prevent the console from being cluttered with messages when you are loading multiple packages or when you don't want to see the package loading messages.
shhh <- suppressPackageStartupMessages # It's a library, so shhh!


shhh(library( mgcv ))
shhh(library(dplyr))
shhh(library(ggplot2))
shhh(library(lme4))
shhh(library(tidymv))
shhh(library(gamlss))
shhh(library(gsubfn))
shhh(library(lmerTest))
shhh(library(tidyverse))
shhh(library(boot))
shhh(library(rsample))
shhh(library(plotrix))
shhh(library(ggrepel))
shhh(library(mgcv))

# Use ggplot2's dark-on-light `theme_bw()` as the default theme for every plot
# drawn after this point, unless a plot sets its own theme.
ggplot2::theme_set(ggplot2::theme_bw())

# Print numbers with about 4 significant digits instead of R's default of 7.
# This only affects how values are displayed, not how they are stored.
options(digits = 4)
# Fix the random seed so anything relying on random numbers is reproducible.
set.seed(444)

# Progress marker for pipelines: emits `status` through message() and then
# returns `.data` untouched, so the call can be dropped between two pipe steps
# without affecting the data flowing through.
pipe_message <- function(.data, status) {
  message(status)
  .data
}

```

```{r}

```

# Read in MoTR Data

```{r}
# Read every per-reader reading-measures CSV in one directory and stack them
# into a single data frame, tagging each row with the reader it came from.

file_prefix <- "../reading_measures/cleaned_f160"
# Only pick up the per-reader files, so stray files in the directory cannot
# break read.csv() or the column selection below.
fnames <- list.files(path = file_prefix, pattern = "_reading_measures\\.csv$")
if (length(fnames) == 0) {
  stop("No *_reading_measures.csv files found in ", file_prefix, call. = FALSE)
}

# fnames look like: "reader_55_reading_measures.csv" "reader_56_reading_measures.csv"
#                   "reader_59_reading_measures.csv" "reader_61_reading_measures.csv"

# Read one file and keep only the columns used downstream.
# `subj` is the file name `f` with the literal suffix "_reading_measures.csv"
# removed (e.g. "reader_55"); fixed() stops the "." being read as a regex
# wildcard. `go_pass_time` is renamed to `go_past_time`.
read_reader_file <- function(f) {
  read.csv(file.path(file_prefix, f)) %>%
    mutate(subj = str_remove(f, fixed("_reading_measures.csv"))) %>%
    dplyr::select(expr_id, cond_id, para_nr, word, word_nr, first_duration, total_duration,
                  gaze_duration, go_pass_time, FPReg, subj) %>%
    rename(go_past_time = go_pass_time)
}

# Read all files first and bind once, instead of growing `df` with rbind()
# inside a loop (which re-copies the accumulated rows on every iteration).
df <- bind_rows(lapply(fnames, read_reader_file))
df

# Reshape to long format. The five measure columns first_duration ... FPReg
# (columns 6-10 of `df`) are stacked: their names go into `metric`, their
# values into `value`. expr_id 1 is labelled "Attachment", any other value
# "Provo".
motr_df <- df %>%
  mutate(expr_id = if_else(expr_id == 1, "Attachment", "Provo")) %>%
  gather(key = metric, value = value, first_duration:FPReg)

motr_df

```

```{r}
# Average across subjects: mean of each metric per word. Rows containing an
# NA in any column are removed before averaging.
motr_agg_df <- motr_df %>%
  drop_na() %>%
  group_by(expr_id, cond_id, para_nr, word, word_nr, metric) %>%
  summarise(value = mean(value), .groups = "drop") %>%
  # sort rows (like sort_values in pandas)
  arrange(expr_id, cond_id, para_nr, word_nr)

# View(motr_agg_df)

motr_agg_df

# Number of long-format rows contributed by each reader.
table(motr_df$subj)

```

```{r}

# Provo rows of the averaged MoTR data. The experiment/condition columns are
# dropped and the remaining ones renamed to the keys used when merging with
# the Provo eye-tracking data (text_id, word_text_idx) plus `motr_value`.
motr_provo_df <- motr_agg_df %>%
  filter(expr_id == "Provo") %>%
  dplyr::select(-expr_id, -cond_id) %>%
  rename(
    text_id = para_nr,
    word_text_idx = word_nr,
    motr_value = value
  )

motr_provo_df

```



# Comparison to Provo

```{r}
# Provo word-level predictors: surprisal (`surp`), frequency (`freq`) and
# length (`len`). `trigger_idx` is renamed to `word_idx` within the select.
provo_modeling_df <- read.csv("../ancillary_data/provo_df.csv") %>%
  dplyr::select(text_id, sent_id, word_idx = trigger_idx, word, freq, surp, len)

provo_modeling_df

```

```{r}
# Read the Provo eye-tracking data and average each measure over participants.

provo_raw_df <- read.csv("../ancillary_data/provo_eyetracking.csv")

provo_eyetracking_df <- provo_raw_df %>%
  # Keep the needed columns and rename them in one step (same column order as
  # listed here).
  dplyr::select(
    subj = Participant_ID,
    text_id = Text_ID,
    sent_id = Sentence_Number,
    word_idx = Word_In_Sentence_Number,
    word_text_idx = IA_ID,
    word = Word,
    first_duration = IA_FIRST_FIXATION_DURATION,
    ff_progressive = IA_FIRST_FIX_PROGRESSIVE,
    gaze_duration = IA_FIRST_RUN_DWELL_TIME,
    total_duration = IA_DWELL_TIME,
    go_past_time = IA_REGRESSION_PATH_DURATION,
    FPReg = IA_REGRESSION_OUT
  ) %>%
  # Gaze duration and go-past time are set to 0 wherever
  # IA_FIRST_FIX_PROGRESSIVE is 0.
  mutate(
    gaze_duration = ifelse(ff_progressive == 0, 0, gaze_duration),
    go_past_time = ifelse(ff_progressive == 0, 0, go_past_time)
  ) %>%
  dplyr::select(-ff_progressive) %>%
  # Stack columns 7-11 (first_duration, gaze_duration, total_duration,
  # go_past_time and FPReg -- FPReg IS included here).
  gather(metric, value, first_duration:FPReg) %>%
  # Missing values count as 0; everything else is converted to integer.
  mutate(value = if_else(is.na(value), 0L, as.integer(value))) %>%
  # Removes rows that still contain an NA in any column. Original author's
  # note: in practice this drops the first word of a sentence -- TODO confirm.
  drop_na() %>%
  group_by(text_id, word_text_idx, sent_id, word_idx, word, metric) %>%
  # Averaging over participants turns the binary FPReg (0/1) into a proportion.
  summarise(value = mean(value), .groups = "drop")

# View(provo_eyetracking_df)
provo_eyetracking_df

```


```{r}
# Step 1: attach surprisal / frequency / length to the eye-tracking averages.
# Both inputs carry a `word` column, so merge() produces `word.x` (from the
# eye-tracking data) and `word.y` (from the modeling file).
# word_text_idx is shifted down by one before the second merge -- presumably
# to match the MoTR word numbering; TODO confirm.
provo_df <- merge(
  provo_eyetracking_df, provo_modeling_df,
  by = c("text_id", "sent_id", "word_idx")
) %>%
  mutate(word_text_idx = as.integer(word_text_idx - 1)) %>%
  arrange(text_id, sent_id, word_idx)
provo_df

# Step 2: attach the MoTR averages, keep only rows where the eye-tracking word
# and the MoTR word agree, drop MoTR outliers, and stack the two measurement
# sources into `measure` / `value`.
provo_df <- merge(
  provo_df, motr_provo_df,
  by = c("text_id", "word_text_idx", "metric")
) %>%
  rename(eyetr_value = value) %>%
  arrange(text_id, sent_id, word_idx) %>%
  filter(word.x == word) %>% # word.y has no capitalised words
  dplyr::select(-word.x, -word.y) %>%
  # NOTE(review): the mean + 3 SD threshold is computed over all metrics
  # pooled together (FPReg rows are still present at this point), not per
  # metric -- confirm this is intended.
  mutate(motr_outlier = motr_value > (mean(motr_value) + 3 * sd(motr_value))) %>%
  filter(!motr_outlier) %>%
  gather(measure, value, eyetr_value, motr_value) %>%
  # filter(metric != "first_duration") %>%
  filter(metric != "FPReg")

# View(provo_df)


```


```{r}
# Density of the values, one curve per metric, with one panel per measurement
# source (`measure`).
ggplot(provo_df, aes(x = value, color = metric)) +
  geom_density() +
  facet_wrap(vars(measure)) +
  labs(x = "Reading Measure")
# ggsave("../visualization/density.png", device = "png", width = 6, height = 2.5)
```

```{r}
# Same density plot restricted to the MoTR values.
ggplot(
  data = dplyr::filter(provo_df, measure == "motr_value"),
  mapping = aes(x = value, color = metric)
) +
  geom_density() +
  labs(x = "Reading Measure", title = "Density plot of value for motr")
```

```{r}
# Same density plot restricted to the eye-tracking values.
ggplot(
  data = dplyr::filter(provo_df, measure == "eyetr_value"),
  mapping = aes(x = value, color = metric)
) +
  geom_density() +
  labs(x = "Reading Measure", title = "Density plot of value for eyetr")
```


```{r}
# Pearson correlation between eye-tracking and MoTR total duration, computed
# on the wide table (one column per measurement source).
gd_df <- provo_df %>%
  dplyr::filter(metric == "total_duration") %>%
  spread(key = measure, value = value)
# View(gd_df)

cor.test(gd_df$eyetr_value, gd_df$motr_value) # cor rise from 0.529 to 0.5989

# MoTR value against eye-tracking value, one panel per metric with its own
# axis ranges, with a smoothed trend line.
ggplot(
  data = spread(provo_df, measure, value),
  mapping = aes(x = motr_value, y = eyetr_value, color = metric)
) +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_wrap(vars(metric), scales = "free")

# ggsave("../visualization/metric_cor.png", device = "png", width = 6, height = 2.5)

```

```{r}
# Pearson correlation between eye-tracking and MoTR first duration.
gd_df_2 <- provo_df %>%
  dplyr::filter(metric == "first_duration") %>%
  spread(key = measure, value = value)

cor.test(gd_df_2$eyetr_value, gd_df_2$motr_value)
```

```{r}
# Pearson correlation between eye-tracking and MoTR gaze duration.
gd_df_3 <- provo_df %>%
  dplyr::filter(metric == "gaze_duration") %>%
  spread(key = measure, value = value)

cor.test(gd_df_3$eyetr_value, gd_df_3$motr_value)
```


```{r}
# Pearson correlation between eye-tracking and MoTR go-past time.
gd_df_4 <- provo_df %>%
  dplyr::filter(metric == "go_past_time") %>%
  spread(key = measure, value = value)

cor.test(gd_df_4$eyetr_value, gd_df_4$motr_value)

```
```{r}
# Gaze duration against each word property (freq, len, surp): one row of
# panels per property, one column per measurement source.
provo_df %>%
  dplyr::filter(metric == "gaze_duration") %>%
  gather(word_prop, word_prop_val, freq, len, surp) %>%
  ggplot(aes(x = value, y = word_prop_val, color = measure)) +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_grid(rows = vars(word_prop), cols = vars(measure), scales = "free") +
  labs(x = "Reading Measure")

# ggsave("../visualization/word_prop_comps.png", device = "png", width = 6, height = 3)
```

```{r}
# Word frequency against reading value, panels by metric (rows) and
# measurement source (columns).
ggplot(provo_df, aes(x = value, y = freq, color = metric)) +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_grid(rows = vars(metric), cols = vars(measure), scales = "free")
```

```{r}
# Surprisal against reading value, panels by metric (rows) and measurement
# source (columns).
ggplot(provo_df, aes(x = value, y = surp, color = metric)) +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_grid(rows = vars(metric), cols = vars(measure), scales = "free")
```



## Shape of surprisal / RT relationship

```{r}

# Fit one GAM to a single bootstrap resample and return the predicted partial
# effect of surprisal on the response.
#
# Args:
#   bootstrap_sample: one rsample split (an element of `bootstraps()$splits`).
#   mean_predictors:  one-row data frame with columns `surp`, `freq` and `len`;
#                     used to fill the predictors that are held constant in
#                     the prediction grid.
# Returns:
#   The prediction grid (surp = 0, 0.1, ..., 20 plus the constant predictors)
#   with an extra column `y`: the summed contributions of the `s(surp)` and
#   `s(prev_surp)` terms.
fit_gam_inner <- function(bootstrap_sample, mean_predictors) {
  df <- bootstrap_sample$data
  # Count how often each original row index occurs in the resample and use
  # the counts as case weights, so the model is fit on the full data frame.
  weights <- tabulate(as.integer(bootstrap_sample), nrow(df))

  m <- gam(
    psychometric ~ s(surp, bs = "cr", k = 6) + s(prev_surp, bs = "cr", k = 6) +
      te(freq, len, bs = "cr") + te(prev_freq, prev_len, bs = "cr"),
    data = df, weights = weights
  )
  terms_to_predict <- c("s(surp)", "s(prev_surp)")

  # `len` / `prev_len` are held at the mean word length. (They were previously
  # filled with the mean *frequency* by mistake.)
  newdata <- data.frame(
    surp = seq(0, 20, by = 0.1), prev_surp = mean_predictors$surp,
    freq = mean_predictors$freq, prev_freq = mean_predictors$freq,
    len = mean_predictors$len, prev_len = mean_predictors$len
  )

  # Matrix with one row per grid point and one column per requested term.
  per_term_predictions <- predict(m, newdata = newdata, terms = terms_to_predict, type = "terms")

  # Additive model -- sum the per-term contributions across columns.
  predictions <- rowSums(per_term_predictions)

  newdata %>% mutate(y = predictions)
}

# Bootstrap the GAM fit and summarise the predicted surprisal effect.
#
# Args:
#   df:              data frame containing the columns used by the model
#                    formula in fit_gam_inner().
#   mean_predictors: passed through unchanged to fit_gam_inner().
#   alpha:           the band spans the alpha/2 and 1 - alpha/2 quantiles of
#                    the bootstrap predictions (2.5% / 97.5% by default).
#   times:           number of bootstrap resamples. Defaults to 10, the value
#                    that used to be hard-coded.
# Returns:
#   One row per surprisal grid value with columns `surp`, `y_lower`,
#   `y_upper` and `y` (the mean over bootstrap fits).
fit_gam <- function(df, mean_predictors, alpha = 0.05, times = 10) {
  # Resample the data and fit one GAM per resample.
  boot_models <- df %>%
    bootstraps(times = times) %>%
    mutate(smoothed = map(splits, fit_gam_inner, mean_predictors = mean_predictors))

  # Per surprisal value: lower/upper quantile and mean of the predictions.
  # `y` is summarised last so the quantiles still see the raw predictions.
  boot_models %>%
    unnest(smoothed) %>%
    dplyr::select(surp, y) %>%
    group_by(surp) %>%
    summarise(
      y_lower = quantile(y, alpha / 2),
      y_upper = quantile(y, 1 - alpha / 2),
      y = mean(y)
    ) %>%
    ungroup()
}


```


```{r}

# Modeling table: one row per word and metric with both measurement sources
# as columns, plus the previous word's surprisal / frequency / length /
# eye-tracking value (lagged within metric and text). `len` is recomputed as
# the number of characters in `word`. Rows with any NA are dropped, and the
# MoTR value becomes the response `psychometric`.
gam_modeling_df <- provo_df %>%
  spread(measure, value) %>%
  mutate(len = nchar(word)) %>%
  group_by(metric, text_id) %>%
  arrange(word_text_idx) %>%
  mutate(
    prev_surp = lag(surp),
    prev_freq = lag(freq),
    prev_len = lag(len),
    prev_eyetr_value = lag(eyetr_value)
  ) %>%
  ungroup() %>%
  drop_na() %>%
  rename(psychometric = motr_value)


# Fit the bootstrapped GAM once per metric and collect the smooths.
smooths_df <- data.frame()

metrics <- c("gaze_duration", "total_duration", "go_past_time", "first_duration")
for (m in metrics) {
  print(paste0("Fitting model for ", m))
  metric_df <- dplyr::filter(gam_modeling_df, metric == m)
  mean_predictors <- summarise(
    metric_df,
    surp = mean(surp), len = mean(len), freq = mean(freq)
  )
  smooths <- fit_gam(metric_df, mean_predictors)
  # Shift the curve and its band so the first grid point (surprisal 0) sits
  # at 0 ms.
  gam_smooths <- mutate(
    smooths,
    delta = 0 - y[1],
    y = y + delta,
    y_lower = y_lower + delta,
    y_upper = y_upper + delta
  )
  smooths_df <- rbind(smooths_df, mutate(gam_smooths, psychometric = m))
}

```


```{r}
# Surprisal curves: bootstrapped GAM smooth (line) with its quantile band
# (ribbon), one panel per reading measure.
ggplot() +
  geom_line(data = smooths_df, aes(x = surp, y = y, color = psychometric), size = 0.7) +
  geom_ribbon(data = smooths_df, aes(x = surp, ymin = y_lower, ymax = y_upper, fill = psychometric), alpha = 0.3, size = 0.5) +
  scale_x_continuous(labels = c(0, 10, 20), breaks = c(0, 10, 20), minor_breaks = NULL) +
  facet_wrap(psychometric ~ .) +
  ylab("Slowdown due to Surprisal (ms)") +
  xlab("Surprisal of Word") +
  ggtitle("Relationship between MoTR Times and Surprisal") +
  # The `+` before theme() was missing, so theme() ran as a separate
  # statement and the legend / minor-grid settings never reached the plot.
  theme(
    legend.position = "none",
    panel.grid.minor = element_blank()
  )
```






```{r}





```

# Targeted Evaluation Data



```{r}
# View(motr_df)

# Attachment-experiment rows for the two relative-clause conditions, with
# word positions re-expressed relative to the critical word "who".
motr_attach_df <- motr_df %>%
  filter(expr_id == "Attachment") %>%
  rename(item_id = para_nr) %>%
  mutate(item_id = as.integer(item_id)) %>%
  mutate(cond_id = as.factor(cond_id)) %>%
  # Map the numeric condition codes to labels. Any code other than 1-7 is a
  # filler; a missing code stays missing.
  mutate(cond_id = case_when(
    is.na(cond_id) ~ NA_character_,
    cond_id == 1 ~ "No Comma",
    cond_id == 2 ~ "Comma",
    cond_id == 3 ~ "adv_high",
    cond_id == 4 ~ "adv_low",
    cond_id == 5 ~ "relative_high",
    cond_id == 6 ~ "relative_low",
    cond_id == 7 ~ "practice",
    TRUE ~ "filler"
  )) %>%
  filter(metric != "FPReg") %>%
  filter(cond_id %in% c("relative_high", "relative_low")) %>%

  # filter(! (item_id == 4 & cond_id == "No Comma") ) %>% # just because of alignment issues for now

  # `crit` is the position of the first "who" within each condition/item
  # (NA when the item contains no "who"). It is read directly from the
  # matching rows rather than via unique(crit)[2], which gave the wrong
  # answer when "who" was the first row of a group or had word_nr 0.
  group_by(cond_id, item_id) %>%
  mutate(crit = dplyr::first(word_nr[which(word == "who")])) %>%
  ungroup() %>%
  mutate(word_nr = word_nr - crit)
# Only open the viewer in an interactive session, so non-interactive runs
# never reach View().
if (interactive()) View(motr_attach_df)


# Mean reading time per condition, relative word position and metric, for the
# window from two words before the critical word up to five words after it.
agg_motr_attach_df <- motr_attach_df %>%
  drop_na() %>%
  filter(word_nr >= -2, word_nr < 6) %>%
  group_by(cond_id, word_nr, metric) %>%
  # NOTE: the column is called `sd` but holds the standard error of the mean.
  # The interval is m +/- 1.96 * SE; the lower bound previously used 1.98,
  # which made the error bars asymmetric.
  summarise(
    m = mean(value),
    sd = std.error(value),
    upper = m + 1.96 * sd,
    lower = m - 1.96 * sd,
    n = n(),
    .groups = "drop"
  )

# Only open the viewer in an interactive session, so non-interactive runs
# never reach View().
if (interactive()) View(agg_motr_attach_df)

# Mean reading time (with error bars) by word position relative to the
# critical word, one line per condition and one panel per metric.
ggplot(agg_motr_attach_df, aes(x = word_nr, y = m, color = cond_id)) +
  # Dotted green outline spanning x 2.5-5.5 and y 100-800; the fill is fully
  # transparent.
  geom_rect(
    aes(xmin = 2.5, xmax = 5.5, ymin = 100, ymax = 800),
    fill = alpha("white", 0), color = "#45ef70", linetype = "dotted"
  ) +
  geom_point() +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.3) +
  geom_line() +
  # geom_text(aes(label = word, y = if_else(cond_id == "Comma", 3000, 3500)), size = 2) +
  # facet_grid(para_nr~cond_id) +
  labs(x = "Condition", y = "Reading Time") +
  # scale_x_continuous(breaks=-2:5, labels=c("the", "man", "and", "his", "wife", "ran", "away", "from")) +
  # Tick labels are the words of one example item.
  scale_x_continuous(
    breaks = -2:5,
    labels = c("the", "queen", "who", "praised", "herself", "all", "the", "time")
  ) +
  facet_grid(cols = vars(metric)) +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

#ggsave("../visualization/attachment.png", device = "png", width = 6, height = 3)

```

```{r}

# Machine-specific path to a Julia installation -- presumably only needed by
# the jglmm setup that is commented out below; TODO confirm.
options(JULIA_HOME = "/Applications/Julia-1.8.app/Contents/Resources/julia/bin/")
# library(jglmm)
# jglmm_setup()

# Gaze duration at relative word position 3, with item and subject as factors.
attach_lm_df <- motr_attach_df %>%
  filter(metric == "gaze_duration", word_nr == 3) %>%
  mutate(across(c(item_id, subj), as.factor))

# Linear mixed model: fixed effect of condition, with by-item and by-subject
# random intercepts and condition slopes.
m <- attach_lm_df %>%
  lmer(value ~ cond_id + (cond_id | item_id) + (cond_id | subj), data = .)

summary(m)

```




